{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "chinese-standard.jsonl.dedup\r\n",
      "filter-malay-rojak-en.jsonl.dedup\r\n",
      "filter-malay-rojak-ms.jsonl.dedup\r\n",
      "filter-malay-rojak-rojak.jsonl.dedup\r\n",
      "filter-twitter-id.jsonl.dedup\r\n",
      "filter-twitter-malay-rojak-id.jsonl.dedup\r\n",
      "filter-twitter-malay-rojak-ms.jsonl.dedup\r\n",
      "filter-twitter-malay-rojak-rojak.jsonl.dedup\r\n",
      "kaskus.jsonl.dedup\r\n",
      "local-mandarin.jsonl.dedup\r\n",
      "others.jsonl.dedup\r\n",
      "others-sample-50k.jsonl.dedup\r\n",
      "others-sample.jsonl.dedup\r\n",
      "prepare-english-en.jsonl.dedup\r\n",
      "prepare-english-ms.jsonl.dedup\r\n",
      "prepare-indon.jsonl.dedup\r\n",
      "prepare-indon-standard.jsonl.dedup\r\n",
      "prepare-malay-ms.jsonl.dedup\r\n",
      "prepare-manglish-en.jsonl.dedup\r\n",
      "prepare-manglish-manglish.jsonl.dedup\r\n",
      "prepare-standard-mandarin.jsonl.dedup\r\n",
      "standard-mandarin.jsonl.dedup\r\n"
     ]
    }
   ],
   "source": [
    "!ls *.dedup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = open('train-fasttext-bahasa-en.txt', 'w')\n",
    "test = open('test-fasttext-bahasa-en.txt', 'w')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "standard_english = [\n",
    "    'prepare-english-en.jsonl.dedup'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1999999it [00:03, 644011.63it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in standard_english:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__english {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 2e6:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "local_english = [\n",
    "    'prepare-manglish-en.jsonl.dedup',\n",
    "    'filter-malay-rojak-en.jsonl.dedup',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:01, 652291.03it/s]\n",
      "1314it [00:00, 662019.87it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in local_english:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__english {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 1e6:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "manglish = [\n",
    "    'prepare-manglish-manglish.jsonl.dedup',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1999999it [00:03, 506369.38it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in manglish:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__english {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 2e6:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "standard_indonesian = [\n",
    "    'prepare-indon-standard.jsonl.dedup'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "49999it [00:01, 46774.36it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in standard_indonesian:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "                \n",
    "            data = ' '.join(data.split()[:300])\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__bahasa {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 50000:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "socialmedia_indonesian = [\n",
    "    'prepare-indon.jsonl.dedup',\n",
    "    'filter-twitter-malay-rojak-id.jsonl.dedup',\n",
    "    'kaskus.jsonl.dedup',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "799999it [00:01, 450744.32it/s]\n",
      "799999it [00:01, 770268.67it/s]\n",
      "799999it [00:01, 735014.03it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in socialmedia_indonesian:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__bahasa {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 8e5:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "standard_malay = [\n",
    "    'prepare-english-ms.jsonl.dedup'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:02, 347386.18it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in standard_malay:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "                \n",
    "            data = ' '.join(data.split()[:300])\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__bahasa {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 1e6:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "local_malay = [\n",
    "    'filter-malay-rojak-ms.jsonl.dedup',\n",
    "    'filter-twitter-malay-rojak-ms.jsonl.dedup',\n",
    "    'filter-twitter-malay-rojak-ms.jsonl.dedup',\n",
    "    'filter-malay-rojak-rojak.jsonl.dedup'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "799999it [00:01, 742893.02it/s]\n",
      "799999it [00:00, 827146.53it/s]\n",
      "799999it [00:00, 837993.89it/s]\n",
      "799999it [00:01, 526790.42it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in local_malay:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__bahasa {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 8e5:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "standard_mandarin = [\n",
    "    'chinese-standard.jsonl.dedup', \n",
    "    'standard-mandarin.jsonl.dedup', \n",
    "    'prepare-standard-mandarin.jsonl.dedup'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def reject(ori):\n",
    "    s = ori.lower()\n",
    "    s = re.sub('[^A-Za-z ]+', ' ', s)\n",
    "    s = re.sub(r'[ ]+', ' ', s).strip()\n",
    "    return (len(s) / len(ori)) > 0.05"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "79664it [00:00, 347270.54it/s]\n",
      "897991it [00:03, 297744.39it/s]\n",
      "939531it [00:25, 36946.69it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in standard_mandarin:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "                \n",
    "            if reject(data):\n",
    "                continue\n",
    "                \n",
    "            data = data[:300]\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__other {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 8e5:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reject(\"\\u5356\\u7684\\u4eba \\u8d5a\\u94b1\\u5f88\\u6709\\u6548\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "local_mandarin = [\n",
    "    'local-mandarin.jsonl.dedup'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"\\u5356\\u7684\\u4eba \\u8d5a\\u94b1\\u5f88\\u6709\\u6548\"\r\n",
      "\"\\u5514\\u7559\\u624b \\u53d1\\u8868\\u4e8e am \\u5982\\u53ea\\u8ad6lcci\\u61c9\\u8a72\\u662feu\\u5427\\u2026\\u2026\\u3002 \\u6211\\u8bfb\\u5b8clcci \\u8981\\u4feeacca\\u7684\"\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 2 local-mandarin.jsonl.dedup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "760400it [00:04, 159495.85it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in local_mandarin:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "                \n",
    "            if reject(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__other {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 500000:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2171290it [00:04, 496143.66it/s]\n"
     ]
    }
   ],
   "source": [
    "count = 0\n",
    "with open('others-sample-50k.jsonl.dedup') as fopen:\n",
    "    for l in tqdm(fopen):\n",
    "        data = json.loads(l).strip()\n",
    "        if not len(data):\n",
    "            continue\n",
    "            \n",
    "        if random.random() >= 0.95:\n",
    "            f = test\n",
    "        else:\n",
    "            f = train\n",
    "        \n",
    "        t = f'__label__other {data}'\n",
    "        f.write(f'{t}\\n')\n",
    "        count += 1\n",
    "        if count >= 1e7:\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "test.close()\n",
    "train.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "!shuf train-fasttext-bahasa-en.txt > shuf-train-fasttext-bahasa-en.txt\n",
    "!shuf test-fasttext-bahasa-en.txt > shuf-test-fasttext-bahasa-en.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "train_label = defaultdict(int)\n",
    "test_label = defaultdict(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "15200528it [00:16, 948297.12it/s] \n"
     ]
    }
   ],
   "source": [
    "with open('train-fasttext-bahasa-en.txt') as fopen:\n",
    "    for l in tqdm(fopen):\n",
    "        train_label[l.split()[0]] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defaultdict(int,\n",
       "            {'__label__english': 4751568,\n",
       "             '__label__bahasa': 6317187,\n",
       "             '__label__other': 4131773})"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "800112it [00:00, 930473.31it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "defaultdict(int,\n",
       "            {'__label__english': 249746,\n",
       "             '__label__bahasa': 332813,\n",
       "             '__label__other': 217553})"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('test-fasttext-bahasa-en.txt') as fopen:\n",
    "    for l in tqdm(fopen):\n",
    "        test_label[l.split()[0]] += 1\n",
    "        \n",
    "test_label"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
